rm(list=ls()) # clean env
options(scipen=999) # seed randomness
library(tidyr)
library(ggplot2)
library(ggExtra)
library(MASS)
library(car)
library(nnet)
library(caret)
library(caTools)
library(rstatix)
library(Metrics)
library(pROC)
library(corrr)
library(ggcorrplot)
library(ggpubr)
library(VGAM)
set.seed(0)
data <- read.table('winequality-red.csv', sep=",", header=T, stringsAsFactors=T)
head(data)
n <- nrow(data) #n#
p <- ncol(data) #p#
summary(data)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900 Min. :0.01200
1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900 1st Qu.:0.07000
Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200 Median :0.07900
Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539 Mean :0.08747
3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600 3rd Qu.:0.09000
Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500 Max. :0.61100
free.sulfur.dioxide total.sulfur.dioxide density pH
Min. : 1.00 Min. : 6.00 Min. :0.9901 Min. :2.740
1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210
Median :14.00 Median : 38.00 Median :0.9968 Median :3.310
Mean :15.87 Mean : 46.47 Mean :0.9967 Mean :3.311
3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400
Max. :72.00 Max. :289.00 Max. :1.0037 Max. :4.010
sulphates alcohol quality
Min. :0.3300 Min. : 8.40 Min. :3.000
1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
Median :0.6200 Median :10.20 Median :6.000
Mean :0.6581 Mean :10.42 Mean :5.636
3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
Max. :2.0000 Max. :14.90 Max. :8.000
unique(data$fixed.acidity)
[1] 7.4 7.8 11.2 7.9 7.3 7.5 6.7 5.6 8.9 8.5 8.1 7.6 6.9 6.3 7.1 8.3
[17] 5.2 5.7 8.8 6.8 4.6 7.7 8.7 6.4 6.6 8.6 10.2 7.0 7.2 9.3 8.0 9.7
[33] 6.2 5.0 4.7 8.4 10.1 9.4 9.0 8.2 6.1 5.8 9.2 11.5 5.4 9.6 12.8 11.0
[49] 11.6 12.0 15.0 10.8 11.1 10.0 12.5 11.8 10.9 10.3 11.4 9.9 10.4 13.3 10.6 9.8
[65] 13.4 10.7 11.9 12.4 12.2 13.8 9.1 13.5 10.5 12.6 14.0 13.7 9.5 12.7 12.3 15.6
[81] 5.3 11.3 13.0 6.5 12.9 14.3 15.5 11.7 13.2 15.9 12.1 5.1 4.9 5.9 6.0 5.5
unique(data$voltile.acidity)
NULL
unique(data$citric.acid)
[1] 0.00 0.04 0.56 0.06 0.02 0.36 0.08 0.29 0.18 0.19 0.28 0.51 0.48 0.31 0.21 0.11
[17] 0.14 0.16 0.24 0.07 0.12 0.25 0.09 0.30 0.20 0.22 0.15 0.43 0.52 0.23 0.37 0.26
[33] 0.57 0.40 0.49 0.05 0.54 0.64 0.70 0.47 0.44 0.17 0.68 0.53 0.10 0.01 0.55 1.00
[49] 0.03 0.42 0.33 0.32 0.35 0.60 0.74 0.58 0.50 0.76 0.46 0.45 0.38 0.39 0.66 0.62
[65] 0.67 0.79 0.63 0.61 0.71 0.65 0.59 0.34 0.69 0.73 0.72 0.41 0.27 0.75 0.13 0.78
unique(data$residual.sugar)
[1] 1.90 2.60 2.30 1.80 1.60 1.20 2.00 6.10 3.80 3.90 1.70 4.40 2.40
[14] 1.40 2.50 10.70 5.50 2.10 1.50 5.90 2.80 2.20 3.00 3.40 5.10 4.65
[27] 1.30 7.30 7.20 2.90 2.70 5.60 3.10 3.20 3.30 3.60 4.00 7.00 6.40
[40] 3.50 11.00 3.65 4.50 4.80 2.95 5.80 6.20 4.20 7.90 3.70 6.70 6.60
[53] 2.15 5.20 2.55 15.50 4.10 8.30 6.55 4.60 4.30 5.15 6.30 6.00 8.60
[66] 7.50 2.25 4.25 2.85 3.45 2.35 2.65 9.00 8.80 5.00 1.65 2.05 0.90
[79] 8.90 8.10 4.70 1.75 7.80 12.90 13.40 5.40 15.40 3.75 13.80 5.70 13.90
unique(data$chlorides)
[1] 0.076 0.098 0.092 0.075 0.069 0.065 0.073 0.071 0.097 0.089 0.114 0.176 0.170
[14] 0.368 0.086 0.341 0.077 0.082 0.106 0.084 0.085 0.080 0.105 0.083 0.103 0.066
[27] 0.172 0.074 0.088 0.332 0.050 0.054 0.113 0.068 0.081 0.110 0.070 0.111 0.079
[40] 0.115 0.094 0.093 0.104 0.464 0.401 0.062 0.107 0.045 0.058 0.102 0.467 0.091
[53] 0.122 0.090 0.119 0.178 0.146 0.072 0.118 0.049 0.060 0.117 0.087 0.236 0.610
[66] 0.095 0.100 0.360 0.067 0.270 0.099 0.046 0.061 0.056 0.039 0.059 0.101 0.057
[79] 0.337 0.078 0.263 0.063 0.611 0.064 0.096 0.358 0.343 0.186 0.112 0.213 0.214
[92] 0.121 0.128 0.052 0.120 0.116 0.109 0.159 0.124 0.174 0.047 0.127 0.413 0.152
[105] 0.053 0.055 0.051 0.125 0.200 0.171 0.226 0.250 0.108 0.148 0.143 0.222 0.157
[118] 0.422 0.034 0.387 0.415 0.243 0.241 0.190 0.132 0.126 0.038 0.044 0.041 0.165
[131] 0.048 0.145 0.147 0.012 0.194 0.161 0.123 0.414 0.216 0.043 0.042 0.369 0.166
[144] 0.136 0.403 0.137 0.168 0.153 0.267 0.169 0.205 0.235 0.230
unique(data$free.sulfur.dioxide)
[1] 11.0 25.0 15.0 17.0 13.0 9.0 16.0 52.0 51.0 35.0 6.0 29.0 23.0 10.0 21.0 4.0
[17] 14.0 8.0 22.0 40.0 5.0 3.0 7.0 12.0 30.0 33.0 50.0 19.0 20.0 27.0 18.0 28.0
[33] 34.0 42.0 41.0 37.0 32.0 36.0 24.0 26.0 39.0 40.5 68.0 31.0 38.0 43.0 47.0 1.0
[49] 54.0 46.0 45.0 2.0 5.5 53.0 37.5 57.0 48.0 72.0 55.0 66.0
unique(data$total.sulfur.dioxide)
[1] 34.0 67.0 54.0 60.0 40.0 59.0 21.0 18.0 102.0 65.0 29.0 145.0 148.0
[14] 103.0 56.0 71.0 37.0 23.0 11.0 35.0 16.0 82.0 113.0 83.0 50.0 15.0
[27] 30.0 19.0 87.0 46.0 14.0 114.0 12.0 96.0 119.0 73.0 45.0 10.0 110.0
[40] 52.0 112.0 39.0 27.0 94.0 43.0 42.0 80.0 51.0 61.0 136.0 31.0 125.0
[53] 24.0 140.0 133.0 85.0 106.0 22.0 36.0 69.0 64.0 153.0 47.0 108.0 111.0
[66] 62.0 28.0 89.0 13.0 90.0 134.0 99.0 26.0 63.0 105.0 20.0 141.0 88.0
[79] 129.0 128.0 86.0 121.0 101.0 44.0 8.0 49.0 38.0 143.0 144.0 127.0 126.0
[92] 120.0 55.0 93.0 95.0 41.0 58.0 72.0 81.0 109.0 33.0 53.0 98.0 48.0
[105] 70.0 25.0 135.0 92.0 74.0 32.0 77.0 165.0 75.0 124.0 78.0 122.0 66.0
[118] 68.0 17.0 91.0 76.0 151.0 142.0 116.0 149.0 57.0 104.0 84.0 147.0 155.0
[131] 152.0 9.0 139.0 130.0 7.0 100.0 115.0 6.0 79.0 278.0 289.0 160.0 77.5
[144] 131.0
unique(data$density)
[1] 0.99780 0.99680 0.99700 0.99800 0.99640 0.99460 0.99590 0.99430 0.99740 0.99860
[11] 0.99690 0.99820 0.99660 0.99550 0.99620 0.99720 0.99580 0.99930 0.99570 0.99750
[21] 0.99400 0.99760 0.99340 0.99540 0.99710 0.99560 0.99830 0.99670 0.99610 0.99840
[31] 0.99380 0.99320 0.99650 0.99630 0.99600 0.99730 0.99880 0.99370 0.99520 0.99160
[41] 0.99440 0.99960 0.99500 0.99810 0.99530 0.99240 0.99480 0.99695 0.99545 0.99615
[51] 0.99940 0.99625 0.99585 0.99685 0.99655 0.99525 0.99815 0.99745 0.99270 0.99675
[61] 0.99925 0.99565 1.00005 0.99850 0.99965 0.99575 0.99990 1.00025 0.99870 0.99935
[71] 0.99735 0.99915 0.99910 1.00015 0.99970 1.00100 0.99790 1.00140 1.00010 0.99855
[81] 0.99845 0.99980 0.99645 0.99865 0.99890 0.99975 0.99900 1.00150 1.00020 0.99920
[91] 1.00080 1.00000 1.00060 1.00040 1.00180 0.99120 1.00220 1.00030 0.99490 0.99510
[101] 1.00320 0.99470 0.99950 0.99770 1.00260 1.00315 1.00210 0.99170 0.99220 0.99210
[111] 0.99788 1.00024 0.99768 0.99782 0.99761 0.99803 0.99785 0.99656 0.99488 0.99823
[121] 0.99779 0.99738 0.99701 0.99888 0.99938 0.99744 0.99668 0.99727 0.99586 0.99612
[131] 0.99676 0.99732 0.99814 0.99746 0.99708 0.99818 0.99639 0.99531 0.99786 0.99526
[141] 0.99641 0.99264 0.99682 0.99356 0.99386 0.99702 0.99693 0.99562 1.00012 0.99462
[151] 0.99939 0.99632 0.99976 0.99606 0.99154 0.99624 0.99417 0.99376 0.99832 0.99836
[161] 0.99694 0.99064 0.99672 0.99647 0.99736 0.99629 0.99689 0.99801 0.99652 0.99538
[171] 0.99594 0.99686 0.99438 0.99357 0.99628 0.99748 0.99578 0.99371 0.99522 0.99576
[181] 0.99552 0.99664 0.99614 0.99517 0.99787 0.99533 0.99536 0.99824 0.99577 0.99491
[191] 1.00289 0.99743 0.99774 0.99444 0.99892 0.99528 0.99331 0.99901 0.99674 0.99512
[201] 0.99395 0.99504 0.99516 0.99604 0.99468 0.99543 0.99791 0.99425 0.99509 0.99484
[211] 0.99834 0.99864 0.99498 0.99566 0.99408 0.99458 0.99648 0.99568 0.99613 0.99519
[221] 0.99518 0.99592 0.99654 0.99546 0.99554 0.99733 0.99669 0.99724 0.99643 0.99605
[231] 0.99658 0.99416 0.99712 0.99418 0.99596 0.99556 0.99918 0.99697 0.99378 0.99162
[241] 0.99495 0.99280 0.99603 0.99549 0.99722 0.99354 0.99635 0.99454 0.99598 0.99486
[251] 0.99007 0.99636 0.99642 0.99584 0.99506 0.99822 0.99364 0.99514 0.99854 0.99739
[261] 0.99683 0.99692 0.99756 0.99547 0.99859 0.99294 0.99634 0.99704 0.99258 0.99426
[271] 0.99747 0.99784 0.99358 0.99572 0.99769 0.99534 0.99817 0.99316 0.99471 0.99617
[281] 0.99529 0.99451 0.99479 0.99772 0.99666 0.99392 0.99388 0.99402 0.99360 0.99374
[291] 0.99523 0.99593 0.99396 0.99698 0.99020 0.99252 0.99256 0.99235 0.99352 0.99557
[301] 0.99394 0.99150 0.99379 0.99798 0.99341 0.99330 0.99684 0.99524 0.99764 0.99588
[311] 0.99473 0.99616 0.99622 0.99544 0.99728 0.99551 0.99434 0.99709 0.99384 0.99502
[321] 0.99667 0.99649 0.99716 0.99541 0.99318 0.99346 0.99599 0.99478 0.99754 0.99439
[331] 0.99633 0.99419 0.99878 0.99752 0.99428 0.99659 0.99677 0.99734 0.99678 0.99638
[341] 0.99922 0.99157 0.99718 0.99621 0.99242 0.99494 0.99729 0.99414 0.99721 0.99627
[351] 0.99569 0.99499 0.99437 0.99726 0.99456 0.99564 0.99080 0.99084 0.99350 0.99385
[361] 0.99688 0.99619 0.99476 0.99328 0.99286 0.99914 0.99521 0.99362 0.99558 0.99323
[371] 0.99191 0.99501 0.99290 0.99532 0.99796 0.99581 0.99608 0.99387 0.99448 0.99589
[381] 0.99852 0.99472 0.99587 0.99332 0.99464 0.99699 0.99725 0.99623 0.99609 0.99292
[391] 0.99420 1.00369 0.99713 0.99322 0.99706 0.99974 0.99467 0.99236 0.99705 0.99334
[401] 0.99336 1.00242 0.99182 0.99808 0.99828 0.99719 0.99542 0.99496 0.99344 0.99348
[411] 0.99459 0.99492 0.99508 0.99582 0.99555 0.99410 0.99661 0.99842 0.99489 0.99665
[421] 0.99553 0.99714 0.99631 0.99573 0.99717 0.99397 0.99646 0.99758 0.99306 0.99783
[431] 0.99765 0.99474 0.99483 0.99314 0.99574 0.99651
unique(data$pH)
[1] 3.51 3.20 3.26 3.16 3.30 3.39 3.36 3.35 3.28 3.58 3.17 3.11 3.38 3.04 3.52 3.43
[17] 3.34 3.47 3.46 3.45 3.40 3.42 3.23 3.50 3.33 3.21 3.48 3.90 3.25 3.32 3.15 3.41
[33] 3.44 3.31 3.54 3.13 2.93 3.14 3.75 3.85 3.29 3.08 3.37 3.19 3.07 3.49 3.53 3.24
[49] 3.63 3.22 3.68 2.74 3.59 3.00 3.12 3.57 3.61 3.06 3.60 3.69 3.10 3.05 3.67 3.27
[65] 3.18 3.02 3.55 2.99 3.01 3.56 3.03 3.62 2.88 2.95 2.98 3.09 2.86 3.74 2.92 3.72
[81] 2.87 2.89 2.94 3.66 3.71 3.78 3.70 4.01 2.90
unique(data$sulphates)
[1] 0.56 0.68 0.65 0.58 0.46 0.47 0.57 0.80 0.54 0.52 1.56 0.88 0.93 0.75 1.28 0.50
[17] 1.08 0.53 0.91 0.63 0.59 0.55 0.66 0.60 0.73 0.48 0.83 0.51 0.90 1.20 0.74 0.64
[33] 0.77 0.71 0.62 0.39 0.79 0.95 0.82 1.12 1.14 0.78 1.95 1.22 1.98 0.61 1.31 0.69
[49] 0.67 0.70 0.49 0.92 2.00 0.72 1.59 0.33 1.02 0.97 0.85 0.43 1.03 0.86 0.76 1.61
[65] 1.09 0.84 0.96 0.45 1.26 0.87 0.81 1.00 1.36 1.18 0.89 0.98 1.13 1.04 1.11 0.99
[81] 1.07 0.44 1.06 1.05 0.42 1.17 1.62 0.94 1.34 1.16 1.10 0.40 1.15 0.37 1.33 1.01
unique(data$alcohol)
[1] 9.400000 9.800000 10.000000 9.500000 10.500000 9.200000 9.900000 9.100000
[9] 9.300000 9.000000 9.700000 10.100000 10.600000 9.600000 10.800000 10.300000
[17] 13.100000 10.200000 10.900000 10.700000 12.900000 10.400000 13.000000 14.000000
[25] 11.500000 11.400000 12.400000 11.000000 12.200000 12.800000 12.600000 12.500000
[33] 11.700000 11.300000 12.300000 12.000000 11.900000 11.800000 8.700000 13.300000
[41] 11.200000 11.600000 11.100000 13.400000 12.100000 8.400000 12.700000 14.900000
[49] 13.200000 13.600000 13.500000 10.033333 9.550000 8.500000 11.066667 9.566667
[57] 10.550000 8.800000 13.566667 11.950000 9.950000 9.233333 9.250000 9.050000
[65] 10.750000
unique(data$quality)
[1] 5 6 7 4 8 3
data$label <- with(data, ifelse(quality >= 7, 'great',
ifelse(quality >= 5, 'good', 'poor')))
data$y <- with(data, ifelse(quality >= 7, 1, 0))
df <- data[,1:11]
cat <- data[,13]
lab <- data[,14]
pairs(df)
colMeans(data[,1:12])
fixed.acidity volatile.acidity citric.acid residual.sugar
8.31963727 0.52782051 0.27097561 2.53880550
chlorides free.sulfur.dioxide total.sulfur.dioxide density
0.08746654 15.87492183 46.46779237 0.99674668
pH sulphates alcohol quality
3.31111320 0.65814884 10.42298311 5.63602251
mvec <- colMeans(df) # sample mean vector
covM <- cov(df) # sample covariance matrix
corM <- cor(df) # sample correlation matrix
det(cov(df)) # generalized sample variance
[1] 0.00000000003478418
sum(diag(cov(df))) # total sample variance
[1] 1197.797
FindcrikChi <- function(n, p, alpha=0.5, N=1000){
cricvec <- rep(0, N) #vector for the rQ result collection#
for(i in 1:N){
#iteration to estimate rQ#
numvec <- rchisq(n, p) #generate a data set of size n, degree of freedom=p#
d <- sort(numvec)
q <- qchisq((1:n-0.5)/n, p)
cricvec[i] <- cor(d,q)
}
scricvec <- sort(cricvec)
cN <- ceiling(N* alpha) #to be on the safe side I use ceiling instead of floor(), take the 'worst' alpha*N cor as rQ, everything lower than that is deemed as rejection#
cricvalue <- scricvec[cN]
result <- list(cN, cricvalue, scricvec)
return(result)
}
critic <- FindcrikChi(n, p-1)
critic[[2]]
[1] 0.9993561
DensityPlots <- function(data_set){
for (col in names(data_set)){
print(mean(data_set[[col]]))
qqc <- qqnorm(data_set[[col]], main = paste("QQ - Plot: ", col))
corqq <- cor(qqc$x, qqc$y)
if (round(corqq,2) >= round(critic[[2]],3)){
qqline(data_set[[col]], col='blue', lwd=2)
print(paste('Data ', col, ' is Normally Distributed! with: ', round(corqq,3)))
} else {
qqline(data_set[[col]], col='orange', lwd=2)
print(paste('Data ', col, ' is NOT Normally Distributed! with: ', round(corqq,3)))
}
for ( i in 1:ncol(data_set)){
if (col != names(data_set[i])){
j <- names(data_set[i])
df_mean <- as.data.frame(colMeans(data_set[c(col, j)]))
plot <- ggplot(data = data_set) +
geom_point(mapping = aes(x = .data[[col]], y = .data[[j]])) +
geom_point(data=t(df_mean), mapping=aes(x = .data[[col]], y = .data[[j]]), col="red")
print(ggMarginal(plot, type="densigram"))
# or standard R
# plot(data_set[[col]], data_set[,i], col='blue', lwd=2, xlab=col, ylab=j)
# points(mean(data_set[[col]]), mean(data_set[,i]), col='red', lwd=8)
print(paste(col, ' vs ', names(data_set[i]), ': ',
cov(data_set[[col]], data_set[,i])))
}
}
}
}
DensityPlots(df)
[1] 8.319637
[1] "Data fixed.acidity is NOT Normally Distributed! with: 0.971"
[1] "fixed.acidity vs volatile.acidity : -0.0798514168351465"
[1] "fixed.acidity vs citric.acid : 0.227820003663115"
[1] "fixed.acidity vs residual.sugar : 0.281756262322901"
[1] "fixed.acidity vs chlorides : 0.0076786924869345"
[1] "fixed.acidity vs free.sulfur.dioxide : -2.8009214927039"
[1] "fixed.acidity vs total.sulfur.dioxide : -6.48234585758778"
[1] "fixed.acidity vs density : 0.00219522357567034"
[1] "fixed.acidity vs pH : -0.183585703596037"
[1] "fixed.acidity vs sulphates : 0.0540100915700598"
[1] "fixed.acidity vs alcohol : -0.114421153396092"
[1] 0.5278205
[1] "Data volatile.acidity is NOT Normally Distributed! with: 0.987"
[1] "volatile.acidity vs fixed.acidity : -0.0798514168351465"
[1] "volatile.acidity vs citric.acid : -0.01927162077597"
[1] "volatile.acidity vs residual.sugar : 0.000484190975899359"
[1] "volatile.acidity vs chlorides : 0.00051658691954687"
[1] "volatile.acidity vs free.sulfur.dioxide : -0.0196735903854177"
[1] "volatile.acidity vs total.sulfur.dioxide : 0.450425692371875"
[1] "volatile.acidity vs density : 0.00000744366515837123"
[1] "volatile.acidity vs pH : 0.0064946993036167"
[1] "volatile.acidity vs sulphates : -0.00792143384358653"
[1] "volatile.acidity vs alcohol : -0.0386002214306344"
[1] 0.2709756
[1] "Data citric.acid is NOT Normally Distributed! with: 0.977"
[1] "citric.acid vs fixed.acidity : 0.227820003663115"
[1] "citric.acid vs volatile.acidity : -0.01927162077597"
[1] "citric.acid vs residual.sugar : 0.0394342699716109"
[1] "citric.acid vs chlorides : 0.00186872477792362"
[1] "citric.acid vs free.sulfur.dioxide : -0.124252113922891"
[1] "citric.acid vs total.sulfur.dioxide : 0.227697274031564"
[1] "citric.acid vs density : 0.000134174581031167"
[1] "citric.acid vs pH : -0.0162975823437834"
[1] "citric.acid vs sulphates : 0.0103277145212003"
[1] "citric.acid vs alcohol : 0.0228151729295766"
[1] 2.538806
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.752"
[1] "residual.sugar vs fixed.acidity : 0.281756262322901"
[1] "residual.sugar vs volatile.acidity : 0.000484190975899359"
[1] "residual.sugar vs citric.acid : 0.0394342699716109"
[1] "residual.sugar vs chlorides : 0.00369017590390114"
[1] "residual.sugar vs free.sulfur.dioxide : 2.75861145224526"
[1] "residual.sugar vs total.sulfur.dioxide : 9.4164414789907"
[1] "residual.sugar vs density : 0.000945410861841846"
[1] "residual.sugar vs pH : -0.0186442889838064"
[1] "residual.sugar vs sulphates : 0.00132094135806093"
[1] "residual.sugar vs alcohol : 0.0632189597926113"
[1] 0.08746654
[1] "Data chlorides is NOT Normally Distributed! with: 0.695"
[1] "chlorides vs fixed.acidity : 0.0076786924869345"
[1] "chlorides vs volatile.acidity : 0.00051658691954687"
[1] "chlorides vs citric.acid : 0.00186872477792362"
[1] "chlorides vs residual.sugar : 0.00369017590390114"
[1] "chlorides vs free.sulfur.dioxide : 0.00273830307740836"
[1] "chlorides vs total.sulfur.dioxide : 0.0733867502451861"
[1] "chlorides vs density : 0.000017821756780873"
[1] "chlorides vs pH : -0.00192574495871559"
[1] "chlorides vs sulphates : 0.00296187794937543"
[1] "chlorides vs alcohol : -0.0110915177743286"
[1] 15.87492
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.95"
[1] "free.sulfur.dioxide vs fixed.acidity : -2.8009214927039"
[1] "free.sulfur.dioxide vs volatile.acidity : -0.0196735903854177"
[1] "free.sulfur.dioxide vs citric.acid : -0.124252113922891"
[1] "free.sulfur.dioxide vs residual.sugar : 2.75861145224526"
[1] "free.sulfur.dioxide vs chlorides : 0.00273830307740836"
[1] "free.sulfur.dioxide vs total.sulfur.dioxide : 229.737520947463"
[1] "free.sulfur.dioxide vs density : -0.000433250416209755"
[1] "free.sulfur.dioxide vs pH : 0.113653090831958"
[1] "free.sulfur.dioxide vs sulphates : 0.0915924709670703"
[1] "free.sulfur.dioxide vs alcohol : -0.773698400361301"
[1] 46.46779
[1] "Data total.sulfur.dioxide is NOT Normally Distributed! with: 0.934"
[1] "total.sulfur.dioxide vs fixed.acidity : -6.48234585758778"
[1] "total.sulfur.dioxide vs volatile.acidity : 0.450425692371875"
[1] "total.sulfur.dioxide vs citric.acid : 0.227697274031564"
[1] "total.sulfur.dioxide vs residual.sugar : 9.4164414789907"
[1] "total.sulfur.dioxide vs chlorides : 0.0733867502451861"
[1] "total.sulfur.dioxide vs free.sulfur.dioxide : 229.737520947463"
[1] "total.sulfur.dioxide vs density : 0.00442472714485978"
[1] "total.sulfur.dioxide vs pH : -0.337698792502511"
[1] "total.sulfur.dioxide vs sulphates : 0.239471004640729"
[1] "total.sulfur.dioxide vs alcohol : -7.20929789503922"
[1] 0.9967467
[1] "Data density is Normally Distributed! with: 0.995"
[1] "density vs fixed.acidity : 0.00219522357567034"
[1] "density vs volatile.acidity : 0.00000744366515837123"
[1] "density vs citric.acid : 0.000134174581031167"
[1] "density vs residual.sugar : 0.000945410861841846"
[1] "density vs chlorides : 0.000017821756780873"
[1] "density vs free.sulfur.dioxide : -0.000433250416209755"
[1] "density vs total.sulfur.dioxide : 0.00442472714485978"
[1] "density vs pH : -0.0000995639480166344"
[1] "density vs sulphates : 0.0000475096184959153"
[1] "density vs alcohol : -0.000997951789525837"
[1] 3.311113
[1] "Data pH is Normally Distributed! with: 0.997"
[1] "pH vs fixed.acidity : -0.183585703596037"
[1] "pH vs volatile.acidity : 0.0064946993036167"
[1] "pH vs citric.acid : -0.0162975823437834"
[1] "pH vs residual.sugar : -0.0186442889838064"
[1] "pH vs chlorides : -0.00192574495871559"
[1] "pH vs free.sulfur.dioxide : 0.113653090831958"
[1] "pH vs total.sulfur.dioxide : -0.337698792502511"
[1] "pH vs density : -0.0000995639480166344"
[1] "pH vs sulphates : -0.0051461858201426"
[1] "pH vs alcohol : 0.0338316166393107"
[1] 0.6581488
[1] "Data sulphates is NOT Normally Distributed! with: 0.912"
[1] "sulphates vs fixed.acidity : 0.0540100915700598"
[1] "sulphates vs volatile.acidity : -0.00792143384358653"
[1] "sulphates vs citric.acid : 0.0103277145212003"
[1] "sulphates vs residual.sugar : 0.00132094135806093"
[1] "sulphates vs chlorides : 0.00296187794937543"
[1] "sulphates vs free.sulfur.dioxide : 0.0915924709670703"
[1] "sulphates vs total.sulfur.dioxide : 0.239471004640729"
[1] "sulphates vs density : 0.0000475096184959153"
[1] "sulphates vs pH : -0.0051461858201426"
[1] "sulphates vs alcohol : 0.0169067772332677"
[1] 10.42298
[1] "Data alcohol is NOT Normally Distributed! with: 0.964"
[1] "alcohol vs fixed.acidity : -0.114421153396092"
[1] "alcohol vs volatile.acidity : -0.0386002214306344"
[1] "alcohol vs citric.acid : 0.0228151729295766"
[1] "alcohol vs residual.sugar : 0.0632189597926113"
[1] "alcohol vs chlorides : -0.0110915177743286"
[1] "alcohol vs free.sulfur.dioxide : -0.773698400361301"
[1] "alcohol vs total.sulfur.dioxide : -7.20929789503922"
[1] "alcohol vs density : -0.000997951789525837"
[1] "alcohol vs pH : 0.0338316166393107"
[1] "alcohol vs sulphates : 0.0169067772332677"
df_normal <- data.frame(matrix(nrow=n, ncol = 11))
colnames(df_normal) <- names(df)
normal <- c()
not_normal <- c()
for (col in names(df)){
tryCatch(
{
boxcoxTransc <- boxcox(df[[col]] ~ 1,lambda=seq(-2.5, 2.5,.01))
title(col)
flagidx <- which(boxcoxTransc$y==max(boxcoxTransc$y))
optlam <- boxcoxTransc$x[flagidx]
vec <- df[[col]]
transvec <- (vec^optlam-1)/optlam #according to (4-34)#
# transformed data#
qqts <- qqnorm(transvec, main = paste("QQ - Plot: ", col))
cortrans <- cor(qqts$x, qqts$y)
},
error = function(cond) {
message(paste("Data NOT transformed: ", col))
message("Here's the original error message:")
message(conditionMessage(cond))
# Choose a return value in case of error
qqts <- qqnorm(df[[col]], main = paste("QQ - Plot: ", col))
qqline(df[[col]], col='orange', lwd=2)
cortrans <- cor(qqts$x, qqts$y)
return(cortrans)
},
finally = {
if (round(cortrans, 2) >= round(critic[[2]], 3)){
normal <- append(normal, col)
qqline(transvec, col='blue', lwd=2)
print(paste('Data ', col, ' is Normally Distributed! with: ', round(cortrans,3)))
df_normal[[col]] <- transvec
} else {
not_normal <- append(not_normal, col)
qqline(transvec, col='orange', lwd=2)
print(paste('Data ', col, ' is NOT Normally Distributed! with: ',
round(cortrans,3)))
}
}
)
}
[1] "Data fixed.acidity is Normally Distributed! with: 0.997"
[1] "Data volatile.acidity is Normally Distributed! with: 0.998"
Data NOT transformed: citric.acid
Here's the original error message:
response variable must be positive
[1] "Data citric.acid is Normally Distributed! with: 0.998"
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.988"
[1] "Data chlorides is NOT Normally Distributed! with: 0.933"
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.994"
[1] "Data total.sulfur.dioxide is Normally Distributed! with: 0.995"
[1] "Data density is Normally Distributed! with: 0.996"
[1] "Data pH is Normally Distributed! with: 0.998"
[1] "Data sulphates is Normally Distributed! with: 0.998"
[1] "Data alcohol is NOT Normally Distributed! with: 0.985"
unique(df$citric.acid)
[1] 0.00 0.04 0.56 0.06 0.02 0.36 0.08 0.29 0.18 0.19 0.28 0.51 0.48 0.31 0.21 0.11
[17] 0.14 0.16 0.24 0.07 0.12 0.25 0.09 0.30 0.20 0.22 0.15 0.43 0.52 0.23 0.37 0.26
[33] 0.57 0.40 0.49 0.05 0.54 0.64 0.70 0.47 0.44 0.17 0.68 0.53 0.10 0.01 0.55 1.00
[49] 0.03 0.42 0.33 0.32 0.35 0.60 0.74 0.58 0.50 0.76 0.46 0.45 0.38 0.39 0.66 0.62
[65] 0.67 0.79 0.63 0.61 0.71 0.65 0.59 0.34 0.69 0.73 0.72 0.41 0.27 0.75 0.13 0.78
df_norm <- df_normal[normal]
df_norm <- within(df_norm, rm('citric.acid'))
pairs(df_norm)
DensityPlots(df_norm)
[1] 1.123393
[1] "Data fixed.acidity is Normally Distributed! with: 0.997"
[1] "fixed.acidity vs volatile.acidity : -0.00325578336388052"
[1] "fixed.acidity vs total.sulfur.dioxide : -0.00445079118867292"
[1] "fixed.acidity vs density : 0.0000620357104893582"
[1] "fixed.acidity vs pH : -0.00154673826556414"
[1] "fixed.acidity vs sulphates : 0.00316375032430066"
[1] -0.5907973
[1] "Data volatile.acidity is Normally Distributed! with: 0.998"
[1] "volatile.acidity vs fixed.acidity : -0.00325578336388052"
[1] "volatile.acidity vs total.sulfur.dioxide : 0.0201015425654002"
[1] "volatile.acidity vs density : 0.0000183855215118722"
[1] "volatile.acidity vs pH : 0.00269943316805013"
[1] "volatile.acidity vs sulphates : -0.0282823793958877"
[1] 3.960085
[1] "Data total.sulfur.dioxide is Normally Distributed! with: 0.995"
[1] "total.sulfur.dioxide vs fixed.acidity : -0.00445079118867292"
[1] "total.sulfur.dioxide vs volatile.acidity : 0.0201015425654002"
[1] "total.sulfur.dioxide vs density : 0.00016722798065971"
[1] "total.sulfur.dioxide vs pH : -0.000706287076722766"
[1] "total.sulfur.dioxide vs sulphates : 0.011014698090896"
[1] -0.003278254
[1] "Data density is Normally Distributed! with: 0.996"
[1] "density vs fixed.acidity : 0.0000620357104893582"
[1] "density vs volatile.acidity : 0.0000183855215118722"
[1] "density vs total.sulfur.dioxide : 0.00016722798065971"
[1] "density vs pH : -0.0000292094469761482"
[1] "density vs sulphates : 0.000115258678596354"
[1] 1.17496
[1] "Data pH is Normally Distributed! with: 0.998"
[1] "pH vs fixed.acidity : -0.00154673826556414"
[1] "pH vs volatile.acidity : 0.00269943316805013"
[1] "pH vs total.sulfur.dioxide : -0.000706287076722766"
[1] "pH vs density : -0.0000292094469761482"
[1] "pH vs sulphates : -0.0017818484264631"
[1] -0.6092693
[1] "Data sulphates is Normally Distributed! with: 0.998"
[1] "sulphates vs fixed.acidity : 0.00316375032430066"
[1] "sulphates vs volatile.acidity : -0.0282823793958877"
[1] "sulphates vs total.sulfur.dioxide : 0.011014698090896"
[1] "sulphates vs density : 0.000115258678596354"
[1] "sulphates vs pH : -0.0017818484264631"
df_to_scale <- df[not_normal]
df_to_scale$citric.acid <- df$citric.acid
scale_data <- as.data.frame(scale(df_to_scale))
pairs(scale_data)
DensityPlots(scale_data)
[1] -0.0000000000000001156003
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.752"
[1] "residual.sugar vs chlorides : 0.0556095352035322"
[1] "residual.sugar vs free.sulfur.dioxide : 0.187048995104287"
[1] "residual.sugar vs alcohol : 0.0420754372097311"
[1] "residual.sugar vs citric.acid : 0.143577161570314"
[1] 0.00000000000000008613634
[1] "Data chlorides is NOT Normally Distributed! with: 0.695"
[1] "chlorides vs residual.sugar : 0.0556095352035322"
[1] "chlorides vs free.sulfur.dioxide : 0.00556214700478112"
[1] "chlorides vs alcohol : -0.221140544788283"
[1] "chlorides vs citric.acid : 0.203822913829042"
[1] -0.00000000000000005600528
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.95"
[1] "free.sulfur.dioxide vs residual.sugar : 0.187048995104287"
[1] "free.sulfur.dioxide vs chlorides : 0.00556214700478112"
[1] "free.sulfur.dioxide vs alcohol : -0.0694083535649999"
[1] "free.sulfur.dioxide vs citric.acid : -0.0609781291923049"
[1] 0.00000000000000008786086
[1] "Data alcohol is NOT Normally Distributed! with: 0.964"
[1] "alcohol vs residual.sugar : 0.0420754372097311"
[1] "alcohol vs chlorides : -0.221140544788283"
[1] "alcohol vs free.sulfur.dioxide : -0.0694083535649999"
[1] "alcohol vs citric.acid : 0.109903246641567"
[1] -0.00000000000000009207575
[1] "Data citric.acid is NOT Normally Distributed! with: 0.977"
[1] "citric.acid vs residual.sugar : 0.143577161570314"
[1] "citric.acid vs chlorides : 0.203822913829042"
[1] "citric.acid vs free.sulfur.dioxide : -0.0609781291923049"
[1] "citric.acid vs alcohol : 0.109903246641567"
log_scale <- log(df_to_scale)
pairs(log_scale)
DensityPlots(log_scale)
[1] 0.8502318
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.925"
[1] "residual.sugar vs chlorides : 0.0136963986844303"
[1] "residual.sugar vs free.sulfur.dioxide : 0.0229861717575993"
[1] "residual.sugar vs alcohol : 0.00281540290773283"
[1] "residual.sugar vs citric.acid : NaN"
[1] -2.505462
[1] "Data chlorides is NOT Normally Distributed! with: 0.91"
[1] "chlorides vs residual.sugar : 0.0136963986844303"
[1] "chlorides vs free.sulfur.dioxide : -0.00304888609384212"
[1] "chlorides vs alcohol : -0.00989314903575179"
[1] "chlorides vs citric.acid : NaN"
[1] 2.546132
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.992"
[1] "free.sulfur.dioxide vs residual.sugar : 0.0229861717575993"
[1] "free.sulfur.dioxide vs chlorides : -0.00304888609384212"
[1] "free.sulfur.dioxide vs alcohol : -0.00570807621737924"
[1] "free.sulfur.dioxide vs citric.acid : NaN"
[1] 2.339021
[1] "Data alcohol is NOT Normally Distributed! with: 0.973"
[1] "alcohol vs residual.sugar : 0.00281540290773283"
[1] "alcohol vs chlorides : -0.00989314903575179"
[1] "alcohol vs free.sulfur.dioxide : -0.00570807621737924"
[1] "alcohol vs citric.acid : NaN"
[1] -Inf
Error in plot.window(...) : need finite 'ylim' values
process <- preProcess(df_to_scale, method=c("range"))
norm_scale <- predict(process, df_to_scale)
pairs(norm_scale)
DensityPlots(norm_scale)
[1] 0.112247
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.752"
[1] "residual.sugar vs chlorides : 0.000421956217428721"
[1] "residual.sugar vs free.sulfur.dioxide : 0.00266121112506778"
[1] "residual.sugar vs alcohol : 0.000666163959879993"
[1] "residual.sugar vs citric.acid : 0.00270097739531581"
[1] 0.1259875
[1] "Data chlorides is NOT Normally Distributed! with: 0.695"
[1] "chlorides vs residual.sugar : 0.000421956217428721"
[1] "chlorides vs free.sulfur.dioxide : 0.0000643867261729257"
[1] "chlorides vs alcohol : -0.00284872679448532"
[1] "chlorides vs citric.acid : 0.00311974086464712"
[1] 0.2095059
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.95"
[1] "free.sulfur.dioxide vs residual.sugar : 0.00266121112506778"
[1] "free.sulfur.dioxide vs chlorides : 0.0000643867261729257"
[1] "free.sulfur.dioxide vs alcohol : -0.00167648624130293"
[1] "free.sulfur.dioxide vs citric.acid : -0.00175002977356185"
[1] 0.3112282
[1] "Data alcohol is NOT Normally Distributed! with: 0.964"
[1] "alcohol vs residual.sugar : 0.000666163959879993"
[1] "alcohol vs chlorides : -0.00284872679448532"
[1] "alcohol vs free.sulfur.dioxide : -0.00167648624130293"
[1] "alcohol vs citric.acid : 0.00351002660455024"
[1] 0.2709756
[1] "Data citric.acid is NOT Normally Distributed! with: 0.977"
[1] "citric.acid vs residual.sugar : 0.00270097739531581"
[1] "citric.acid vs chlorides : 0.00311974086464712"
[1] "citric.acid vs free.sulfur.dioxide : -0.00175002977356185"
[1] "citric.acid vs alcohol : 0.00351002660455024"
df_stndardized <- df_to_scale
for (col in names(df_to_scale)){
df_stndardized[[col]] <- (df_to_scale[[col]] - mean(df_to_scale[[col]])) / sd(df_to_scale[[col]])
}
pairs(df_stndardized)
DensityPlots(df_stndardized)
[1] -0.0000000000000001156003
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.752"
[1] "residual.sugar vs chlorides : 0.0556095352035322"
[1] "residual.sugar vs free.sulfur.dioxide : 0.187048995104287"
[1] "residual.sugar vs alcohol : 0.0420754372097311"
[1] "residual.sugar vs citric.acid : 0.143577161570314"
[1] 0.00000000000000008888973
[1] "Data chlorides is NOT Normally Distributed! with: 0.695"
[1] "chlorides vs residual.sugar : 0.0556095352035322"
[1] "chlorides vs free.sulfur.dioxide : 0.00556214700478112"
[1] "chlorides vs alcohol : -0.221140544788283"
[1] "chlorides vs citric.acid : 0.203822913829042"
[1] -0.00000000000000005600528
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.95"
[1] "free.sulfur.dioxide vs residual.sugar : 0.187048995104287"
[1] "free.sulfur.dioxide vs chlorides : 0.00556214700478112"
[1] "free.sulfur.dioxide vs alcohol : -0.0694083535649999"
[1] "free.sulfur.dioxide vs citric.acid : -0.0609781291923049"
[1] 0.00000000000000008080805
[1] "Data alcohol is NOT Normally Distributed! with: 0.964"
[1] "alcohol vs residual.sugar : 0.0420754372097311"
[1] "alcohol vs chlorides : -0.221140544788283"
[1] "alcohol vs free.sulfur.dioxide : -0.0694083535649999"
[1] "alcohol vs citric.acid : 0.109903246641567"
[1] -0.00000000000000009207575
[1] "Data citric.acid is NOT Normally Distributed! with: 0.977"
[1] "citric.acid vs residual.sugar : 0.143577161570314"
[1] "citric.acid vs chlorides : 0.203822913829042"
[1] "citric.acid vs free.sulfur.dioxide : -0.0609781291923049"
[1] "citric.acid vs alcohol : 0.109903246641567"
chiforbi <- FindcrikChi(n, 2)
BivariateQQ <- function(data_set){
for (col in names(data_set)){
for ( i in 1:ncol(data_set)){
if (col != names(data_set[i])){
dat <- c(col, names(data_set[i]))
X <- (data_set[dat])
mu <- colMeans(data_set[dat])
S <- cov(data_set[dat])
result <- c()
tryCatch(
{
for (row in 1:nrow(X)){
v <- as.matrix(X[row,])
result[row] <- mahalanobis(v, mu, S)
#result[row] <- (v-mu)%*%solve(S)%*%t(v-mu)
y <- sort(result)
# The second parameter is now '2'
# because we have only two variables (bivariate)
x <- qchisq(1:length(result)/(length(result)+1), 2)
}
plot(x, y)
if (round(cor(x, y),2) >= round(chiforbi[[2]],3)){
abline(0,1, col='blue', lwd=2)
print(paste(col, ' vs ', names(data_set[i]),
' is Normally Distributed! with: ', round(cor(x, y),3)))
} else {
abline(0,1, col='orange', lwd=2)
print(paste(col, ' vs ', names(data_set[i]),
' is NOT Normally Distributed! with: ', round(cor(x, y),3)))
}
},
error = function(cond) {
message(paste("Data NOT calculated: ", col, names(data_set[i])))
message("Here's the original error message:")
message(conditionMessage(cond))
# Choose a return value in case of error
NA
})
}
}
}
}
round(chiforbi[[2]],3)
[1] 0.999
BivariateQQ(df_norm)
[1] "fixed.acidity vs volatile.acidity is Normally Distributed! with: 0.996"
[1] "fixed.acidity vs total.sulfur.dioxide is Normally Distributed! with: 0.996"
[1] "fixed.acidity vs density is NOT Normally Distributed! with: 0.989"
[1] "fixed.acidity vs pH is NOT Normally Distributed! with: 0.985"
[1] "fixed.acidity vs sulphates is Normally Distributed! with: 0.996"
[1] "volatile.acidity vs fixed.acidity is Normally Distributed! with: 0.996"
[1] "volatile.acidity vs total.sulfur.dioxide is NOT Normally Distributed! with: 0.99"
[1] "volatile.acidity vs density is NOT Normally Distributed! with: 0.993"
[1] "volatile.acidity vs pH is NOT Normally Distributed! with: 0.991"
[1] "volatile.acidity vs sulphates is NOT Normally Distributed! with: 0.992"
[1] "total.sulfur.dioxide vs fixed.acidity is Normally Distributed! with: 0.996"
[1] "total.sulfur.dioxide vs volatile.acidity is NOT Normally Distributed! with: 0.99"
[1] "total.sulfur.dioxide vs density is NOT Normally Distributed! with: 0.988"
[1] "total.sulfur.dioxide vs pH is NOT Normally Distributed! with: 0.992"
[1] "total.sulfur.dioxide vs sulphates is NOT Normally Distributed! with: 0.988"
[1] "density vs fixed.acidity is NOT Normally Distributed! with: 0.989"
[1] "density vs volatile.acidity is NOT Normally Distributed! with: 0.993"
[1] "density vs total.sulfur.dioxide is NOT Normally Distributed! with: 0.988"
[1] "density vs pH is NOT Normally Distributed! with: 0.968"
[1] "density vs sulphates is NOT Normally Distributed! with: 0.992"
[1] "pH vs fixed.acidity is NOT Normally Distributed! with: 0.985"
[1] "pH vs volatile.acidity is NOT Normally Distributed! with: 0.991"
[1] "pH vs total.sulfur.dioxide is NOT Normally Distributed! with: 0.992"
[1] "pH vs density is NOT Normally Distributed! with: 0.968"
[1] "pH vs sulphates is NOT Normally Distributed! with: 0.985"
[1] "sulphates vs fixed.acidity is Normally Distributed! with: 0.996"
[1] "sulphates vs volatile.acidity is NOT Normally Distributed! with: 0.992"
[1] "sulphates vs total.sulfur.dioxide is NOT Normally Distributed! with: 0.988"
[1] "sulphates vs density is NOT Normally Distributed! with: 0.992"
[1] "sulphates vs pH is NOT Normally Distributed! with: 0.985"
df2 <- cbind(df_norm, df_to_scale, data[, 13:14])
df_grate <- df2[df2$label=='great', ]
df_good <- df2[df2$label=='good', ]
df_poor <- df2[df2$label=='poor', ]
df_1 <- df2[df2$y== 1, ]
df_0 <- df2[df2$y== 0, ]
Trying to calculate Box’s M Test manually
n1 <- nrow(df_grate)
n2 <- nrow(df_good)
n3 <- nrow(df_poor)
n4 <- nrow(df_1)
n5 <- nrow(df_0)
m1 <- colMeans(df_grate[,1:11])
m2 <- colMeans(df_good[,1:11])
m3 <- colMeans(df_poor[,1:11])
m4 <- colMeans(df_1[,1:11])
m5 <- colMeans(df_0[,1:11])
s1 <- cov(df_grate[,1:11])
s2 <- cov(df_good[,1:11])
s3 <- cov(df_poor[,1:11])
s4 <- cov(df_1[,1:11])
s5 <- cov(df_0[,1:11])
sp <- ((n1-1)*s1+(n2-1)*s2+(n3-1)*s3)/(n1+n2+n3-3) #Spooled is HERE#
spi <- solve(sp)
spi
fixed.acidity volatile.acidity total.sulfur.dioxide
fixed.acidity 3110.857806 7.3185965 24.2149922
volatile.acidity 7.318596 28.0303581 -1.7038937
total.sulfur.dioxide 24.214992 -1.7038937 3.4452423
density -55207.110776 -1399.9130816 -319.3604423
pH 1638.546160 9.2556597 12.2910877
sulphates 29.824769 4.4528734 -0.3076922
residual.sugar 22.163691 0.2093982 0.1021050
chlorides 278.462380 -30.8393005 4.6354535
free.sulfur.dioxide -1.073089 0.1036913 -0.1898212
alcohol -36.728423 -1.2446720 0.2364624
citric.acid -126.463812 25.3255169 -3.5068357
density pH sulphates residual.sugar
fixed.acidity -55207.11078 1638.5461603 29.82476937 22.16369060
volatile.acidity -1399.91308 9.2556597 4.45287343 0.20939815
total.sulfur.dioxide -319.36044 12.2910877 -0.30769222 0.10210499
density 1671992.60547 -29088.5007914 -1414.61288769 -705.46278416
pH -29088.50079 1673.6982443 7.71877174 12.09372726
sulphates -1414.61289 7.7187717 12.88993094 0.69844840
residual.sugar -705.46278 12.0937273 0.69844840 0.84517595
chlorides -1677.20780 243.3817781 -26.14102361 0.18083640
free.sulfur.dioxide 26.39205 -0.7273122 -0.02061248 -0.02557939
alcohol 1361.28259 -27.9828590 -1.62791964 -0.62976077
citric.acid -983.33987 39.3121655 -0.44452752 -0.30768621
chlorides free.sulfur.dioxide alcohol citric.acid
fixed.acidity 278.4623803 -1.073088817 -36.728423413 -126.4638122
volatile.acidity -30.8393005 0.103691339 -1.244671969 25.3255169
total.sulfur.dioxide 4.6354535 -0.189821166 0.236462372 -3.5068357
density -1677.2077972 26.392049897 1361.282585942 -983.3398690
pH 243.3817781 -0.727312174 -27.982858994 39.3121655
sulphates -26.1410236 -0.020612480 -1.627919637 -0.4445275
residual.sugar 0.1808364 -0.025579391 -0.629760767 -0.3076862
chlorides 622.7774938 -0.168382670 4.727317172 -54.3304842
free.sulfur.dioxide -0.1683827 0.020594188 0.005806883 0.1767275
alcohol 4.7273172 0.005806883 2.435059048 -3.1048161
citric.acid -54.3304842 0.176727524 -3.104816121 80.3449869
sp_2 <- ((n4-1)*s4+(n5-1)*s5)/(n4+n5-2) #Spooled is HERE#
spi_2 <- solve(sp_2)
spi_2
fixed.acidity volatile.acidity total.sulfur.dioxide
fixed.acidity 3108.141661 5.5979305 24.5672927
volatile.acidity 5.597930 27.4099149 -1.5802897
total.sulfur.dioxide 24.567293 -1.5802897 3.4230392
density -55108.160031 -1351.4058851 -329.2106435
pH 1631.987245 6.4562783 12.8469440
sulphates 30.309543 4.6280245 -0.3415658
residual.sugar 22.065924 0.1682393 0.1102374
chlorides 275.882362 -31.8772479 4.8374055
free.sulfur.dioxide -1.075234 0.1032118 -0.1898337
alcohol -36.595349 -1.1877300 0.2253312
citric.acid -127.562063 24.9645491 -3.4353962
density pH sulphates residual.sugar
fixed.acidity -55108.16003 1631.987245 30.30954341 22.06592401
volatile.acidity -1351.40589 6.456278 4.62802455 0.16823933
total.sulfur.dioxide -329.21064 12.846944 -0.34156577 0.11023737
density 1669217.66640 -28889.593985 -1428.84070457 -702.70878282
pH -28889.59399 1662.413129 8.48144409 11.91976107
sulphates -1428.84070 8.481444 12.85144297 0.71004077
residual.sugar -702.70878 11.919761 0.71004077 0.84303333
chlorides -1599.41289 239.055592 -25.88221530 0.11502708
free.sulfur.dioxide 26.45073 -0.730162 -0.02047829 -0.02563066
alcohol 1357.66761 -27.746601 -1.64453352 -0.62641975
citric.acid -954.78733 37.679928 -0.34300162 -0.33226680
chlorides free.sulfur.dioxide alcohol citric.acid
fixed.acidity 275.8823621 -1.075233605 -36.595348932 -127.5620633
volatile.acidity -31.8772479 0.103211799 -1.187730031 24.9645491
total.sulfur.dioxide 4.8374055 -0.189833700 0.225331221 -3.4353962
density -1599.4128909 26.450733628 1357.667611230 -954.7873300
pH 239.0555920 -0.730161982 -27.746601290 37.6799281
sulphates -25.8822153 -0.020478293 -1.644533522 -0.3430016
residual.sugar 0.1150271 -0.025630657 -0.626419754 -0.3322668
chlorides 621.5413949 -0.169357525 4.822436338 -54.9661725
free.sulfur.dioxide -0.1693575 0.020606627 0.005859784 0.1765166
alcohol 4.8224363 0.005859784 2.431362556 -3.0726683
citric.acid -54.9661725 0.176516643 -3.072668332 80.1727517
box_m(df2[,1:11],df2[,"label"])
box_m(df2[,1:11],df2[,"y"])
df2$label <- with(df2, ifelse(label == 'great', 2,
ifelse(label == 'good', 1, 0)))
sample <- sample.split(df2, SplitRatio = 0.7)
train <- subset(df2, sample == TRUE)
test <- subset(df2, sample == FALSE)
multi_model <- multinom(label ~ ., data = train[, 1:12])
# weights: 39 (24 variable)
initial value 1216.163804
iter 10 value 472.679801
iter 20 value 433.492271
iter 30 value 429.718911
iter 40 value 429.634205
iter 50 value 429.363982
iter 60 value 429.357113
iter 70 value 429.292646
iter 80 value 429.133151
iter 90 value 429.037024
final value 428.984627
converged
pred_multi <- predict(multi_model, newdata = test[, 1:12], type = "class")
correct_predictions <- sum(pred_multi == test$label)
correct_predictions
[1] 418
xtab <- table(pred_multi, test$label)
cm <- caret::confusionMatrix(xtab)
cm
Confusion Matrix and Statistics
pred_multi 0 1 2
0 2 1 0
1 22 390 40
2 0 11 26
Overall Statistics
Accuracy : 0.8496
95% CI : (0.8149, 0.88)
No Information Rate : 0.8171
P-Value [Acc > NIR] : 0.033
Kappa : 0.3706
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 0 Class: 1 Class: 2
Sensitivity 0.083333 0.9701 0.39394
Specificity 0.997863 0.3111 0.97418
Pos Pred Value 0.666667 0.8628 0.70270
Neg Pred Value 0.955010 0.7000 0.91209
Prevalence 0.048780 0.8171 0.13415
Detection Rate 0.004065 0.7927 0.05285
Detection Prevalence 0.006098 0.9187 0.07520
Balanced Accuracy 0.540598 0.6406 0.68406
# Accuracy = TP / TOTAL
print(paste('Accuracy: ', (397 + 21) / 492 ))
[1] "Accuracy: 0.849593495934959"
# Recall = TP / (TP + FN)
Metrics::recall(pred_multi, test$label)
[1] 1.039823
# Precision = TP / (TP + FP)
Metrics::precision(pred_multi, test$label)
Warning: argument is not numeric or logical: returning NA
[1] NA
# F1 = 2 * (Precision * Recall) / (Precision + Recall)
Metrics::f1(pred_multi, test$label)
[1] 1
#𝐸(APER)
aer(test$label, pred_multi)
[1] 0.1504065
model_3 <- lm(label ~ ., data=train[, 1:12])
summary(model_3)
Call:
lm(formula = label ~ ., data = train[, 1:12])
Residuals:
Min 1Q Median 3Q Max
-1.37625 -0.18046 -0.01671 0.10944 1.07897
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.6000853 1.0768373 -0.557 0.57746
fixed.acidity 1.1822355 0.5878335 2.011 0.04455 *
volatile.acidity -0.2736555 0.0559587 -4.890 0.0000011573 ***
total.sulfur.dioxide 0.0026437 0.0195225 0.135 0.89231
density -36.3624186 13.6066875 -2.672 0.00764 **
pH -0.4936659 0.4395770 -1.123 0.26166
sulphates 0.2218315 0.0385376 5.756 0.0000000112 ***
residual.sugar 0.0189798 0.0100491 1.889 0.05920 .
chlorides -0.8439080 0.2572084 -3.281 0.00107 **
free.sulfur.dioxide 0.0009161 0.0014820 0.618 0.53658
alcohol 0.0773923 0.0165226 4.684 0.0000031658 ***
citric.acid 0.0048825 0.0979998 0.050 0.96027
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3539 on 1095 degrees of freedom
Multiple R-squared: 0.2324, Adjusted R-squared: 0.2247
F-statistic: 30.13 on 11 and 1095 DF, p-value: < 0.00000000000000022
plot(model_3, pch=df2$label)
pred_train_3 <- predict(model_3, train[, 1:12], type="response")
pred_test_3 <- predict(model_3, newdata = test[, 1:12], type="response")
train_TAB <- table(train$label, pred_train_3 > 0.95)
train_TAB
FALSE TRUE
0 26 13
1 234 683
2 2 149
test_TAB <- table(test$label, pred_test_3 > 0.95)
test_TAB
FALSE TRUE
0 13 11
1 116 286
2 1 65
model_binorm <- glm(y ~., data = train[, -12], family = binomial)
predictions <- predict(model_binorm, newdata = test[, -12], type = "response")
predicted_classes <- ifelse(predictions > 0.5, 0, 1)
mean(predicted_classes == test$y)
[1] 0.101626
xtab <- table(predicted_classes, test$y)
cm <- caret::confusionMatrix(xtab)
cm
Confusion Matrix and Statistics
predicted_classes 0 1
0 12 28
1 414 38
Accuracy : 0.1016
95% CI : (0.0764, 0.1318)
No Information Rate : 0.8659
P-Value [Acc > NIR] : 1
Kappa : -0.1141
Mcnemar's Test P-Value : <0.0000000000000002
Sensitivity : 0.02817
Specificity : 0.57576
Pos Pred Value : 0.30000
Neg Pred Value : 0.08407
Prevalence : 0.86585
Detection Rate : 0.02439
Detection Prevalence : 0.08130
Balanced Accuracy : 0.30196
'Positive' Class : 0
# Accuracy = TP / TOTAL
print(paste('Accuracy: ', (5 + 46) / 492 ))
[1] "Accuracy: 0.103658536585366"
# Recall = TP / (TP + FN)
Metrics::recall(predicted_classes, test$y)
[1] 0.0840708
# Precision = TP / (TP + FP)
Metrics::precision(predicted_classes, test$y)
[1] 0.5757576
# F1 = 2 * (Precision * Recall) / (Precision + Recall)
Metrics::f1(predicted_classes, test$y)
[1] 1
#𝐸(APER)
aer(test$y, predicted_classes)
[1] 0.898374
#ROC -curve
roc_curve <- roc(ifelse(test$y == 0, 1, 0), ifelse(predicted_classes == 0, 1, 0))
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot ROC curve
plot(roc_curve, main = "ROC Curve", col = "blue")
fit <- vglm(label~., family=multinomial, data=train[, 1:12])
summary(fit)
Call:
vglm(formula = label ~ ., family = multinomial, data = train[,
1:12])
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 9.97473 23.73357 0.420 0.67428
(Intercept):2 14.48637 10.24861 NA NA
fixed.acidity:1 -10.84411 12.74768 NA NA
fixed.acidity:2 -7.41820 5.44709 -1.362 0.17324
volatile.acidity:1 4.98964 1.09241 4.568 0.00000493486 ***
volatile.acidity:2 1.73671 0.58314 2.978 0.00290 **
total.sulfur.dioxide:1 -0.12719 0.42767 -0.297 0.76616
total.sulfur.dioxide:2 0.66657 0.21912 3.042 0.00235 **
density:1 220.05204 281.97922 0.780 0.43516
density:2 186.46231 125.54618 1.485 0.13749
pH:1 10.03474 9.72568 1.032 0.30218
pH:2 2.07596 4.37669 0.474 0.63527
sulphates:1 -3.26663 0.78683 -4.152 0.00003300801 ***
sulphates:2 -2.42826 0.41788 -5.811 0.00000000621 ***
residual.sugar:1 0.05203 0.16953 0.307 0.75892
residual.sugar:2 -0.21608 0.08763 -2.466 0.01367 *
chlorides:1 14.64314 4.87659 3.003 0.00268 **
chlorides:2 6.46876 3.59378 1.800 0.07186 .
free.sulfur.dioxide:1 -0.05374 0.03703 -1.451 0.14668
free.sulfur.dioxide:2 -0.02146 0.01543 -1.391 0.16428
alcohol:1 -0.91544 0.33868 NA NA
alcohol:2 -0.77766 0.15623 -4.978 0.00000064344 ***
citric.acid:1 0.51073 2.00315 0.255 0.79875
citric.acid:2 0.50078 1.00098 0.500 0.61687
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
Residual deviance: 856.9503 on 2190 degrees of freedom
Log-likelihood: -428.4752 on 2190 degrees of freedom
Number of Fisher scoring iterations: 7
Warning: Hauck-Donner effect detected in the following estimate(s):
'(Intercept):2', 'fixed.acidity:1', 'alcohol:1'
Reference group is level 3 of the response
probabilities <- predict(fit, test[,1:12], type="response")
predictions <- apply(probabilities, 1, which.max)
predictions[which(predictions==0)] <- test$label == 0
predictions[which(predictions==1)] <- test$label == 1
Warning: number of items to replace is not a multiple of replacement length
predictions[which(predictions==3)] <- test$label == 2
Warning: number of items to replace is not a multiple of replacement length
# summarize accuracy
xtab_v <- table(predictions, test$label)
cm <- caret::confusionMatrix(xtab_v)
cm
Confusion Matrix and Statistics
predictions 0 1 2
0 0 11 28
1 2 2 0
2 22 389 38
Overall Statistics
Accuracy : 0.0813
95% CI : (0.0587, 0.1091)
No Information Rate : 0.8171
P-Value [Acc > NIR] : 1
Kappa : -0.0595
Mcnemar's Test P-Value : <0.0000000000000002
Statistics by Class:
Class: 0 Class: 1 Class: 2
Sensitivity 0.00000 0.004975 0.57576
Specificity 0.91667 0.977778 0.03521
Pos Pred Value 0.00000 0.500000 0.08463
Neg Pred Value 0.94702 0.180328 0.34884
Prevalence 0.04878 0.817073 0.13415
Detection Rate 0.00000 0.004065 0.07724
Detection Prevalence 0.07927 0.008130 0.91260
Balanced Accuracy 0.45833 0.491376 0.30548
# Accuracy = TP / TOTAL
print(paste('Accuracy: ', 47 / 492 ))
[1] "Accuracy: 0.0955284552845529"
# Recall = TP / (TP + FN)
Metrics::recall(predictions, test$labeæ)
Warning: argument is not numeric or logical: returning NA
[1] NA
# Precision = TP / (TP + FP)
Metrics::precision(predictions, test$label)
[1] 1.940299
# F1 = 2 * (Precision * Recall) / (Precision + Recall)
Metrics::f1(predictions, test$label)
[1] 1
#𝐸(APER)
aer(test$label, predictions)
[1] 0.9186992
num_data <- df2[, 1:11]
normalized <- scale(num_data)
#After that we create a correlation matrix
corr_matrix <- cor(normalized)
ggcorrplot(corr_matrix)
#Now we make a pca
pca <- prcomp(num_data, scale=T)
summary(pca)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 1.7614 1.4084 1.2645 1.04443 0.9928 0.85854 0.7556 0.62197
Proportion of Variance 0.2821 0.1803 0.1454 0.09917 0.0896 0.06701 0.0519 0.03517
Cumulative Proportion 0.2821 0.4624 0.6078 0.70693 0.7965 0.86353 0.9154 0.95060
PC9 PC10 PC11
Standard deviation 0.5449 0.42592 0.25491
Proportion of Variance 0.0270 0.01649 0.00591
Cumulative Proportion 0.9776 0.99409 1.00000
data.pca <- princomp(corr_matrix)
summary(data.pca)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
Standard deviation 0.8915682 0.5626588 0.4458347 0.3287614 0.24273993 0.22153175
Proportion of Variance 0.5090581 0.2027442 0.1272934 0.0692181 0.03773467 0.03142897
Cumulative Proportion 0.5090581 0.7118023 0.8390956 0.9083137 0.94604842 0.97747739
Comp.7 Comp.8 Comp.9 Comp.10
Standard deviation 0.15155186 0.089660453 0.054877700 0.0339192702
Proportion of Variance 0.01470892 0.005148255 0.001928635 0.0007368026
Cumulative Proportion 0.99218631 0.997334563 0.999263197 1.0000000000
Comp.11
Standard deviation 0.00000000533722824842028
Proportion of Variance 0.00000000000000001824273
Cumulative Proportion 1.00000000000000000000000
data.pca$loadings[, 1:2]
Comp.1 Comp.2
fixed.acidity 0.50837145 0.03432755
volatile.acidity -0.27437604 0.34818335
total.sulfur.dioxide -0.08619193 0.42217508
density 0.36088060 0.34221286
pH -0.46124636 -0.11623776
sulphates 0.19289017 -0.22326201
residual.sugar 0.07676416 0.13981543
chlorides 0.16932322 0.15362777
free.sulfur.dioxide -0.13509571 0.31081409
alcohol -0.11698193 -0.58865323
citric.acid 0.46060706 -0.18061985
pca_1_2 <-data.pca$loadings[, 1:2]
pca_1_2<-as.matrix(pca_1_2)
numerical_data<-as.matrix(num_data)
#we multiply the numerical_data with our first and second pricipal components
reduced_data<-numerical_data %*% pca_1_2
reduced_data<-as.data.frame(reduced_data)
reduced_data$predicted <- pred_multi <- predict(multi_model,
newdata = df2[, 1:12], type = "class")
reduced_data$true_class <- df2$label
plot1 <- ggplot(reduced_data, aes(x = Comp.1, y = Comp.2, colour = true_class)) +
geom_point()
plot2 <- ggplot(reduced_data, aes(x = Comp.1, y = Comp.2, colour = predicted)) +
geom_point()
plot1
plot2
# install.packages('factoextra')
#library(factoextra)
fviz_eig(data.pca, addlabels = TRUE)
fviz_pca_var(data.pca, col.var = "black")
fviz_cos2(data.pca, choice = "var", axes = 1:2)
fviz_pca_var(data.pca, col.var = "cos2",
gradient.cols = c("black", "orange", "green"),
repel = TRUE)